#pragma once


#include <iostream> 
#include <type_traits> 
#include <natten/dtypes.cuh> 
namespace natten { 
namespace cuda { 
namespace naive { 
// Prototype only; no body accompanies this declaration.
// Host-side entry for the 1-D `pn` naive CUDA path, variant `double_cm_0`
// (presumably element type double, causal-mask flag 0 -- TODO confirm).
// Buffers are untyped pointers; `cc` is presumably the compute capability.
void na1d_pn_cuda_naive_double_cm_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads, int32_t length, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    const std::tuple<int32_t>& kernel_size,
    const std::tuple<int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 1-D `pn` naive CUDA path, variant `double_cm_1`
// (presumably element type double, causal-mask flag 1 -- TODO confirm).
// Buffers are untyped pointers; `cc` is presumably the compute capability.
void na1d_pn_cuda_naive_double_cm_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads, int32_t length, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    const std::tuple<int32_t>& kernel_size,
    const std::tuple<int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 1-D `pn` naive CUDA path, variant `float_cm_0`
// (presumably element type float, causal-mask flag 0 -- TODO confirm).
// Buffers are untyped pointers; `cc` is presumably the compute capability.
void na1d_pn_cuda_naive_float_cm_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads, int32_t length, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    const std::tuple<int32_t>& kernel_size,
    const std::tuple<int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 1-D `pn` naive CUDA path, variant `float_cm_1`
// (presumably element type float, causal-mask flag 1 -- TODO confirm).
// Buffers are untyped pointers; `cc` is presumably the compute capability.
void na1d_pn_cuda_naive_float_cm_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads, int32_t length, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    const std::tuple<int32_t>& kernel_size,
    const std::tuple<int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 1-D `pn` naive CUDA path, variant `half_cm_0`
// (presumably element type half, causal-mask flag 0 -- TODO confirm).
// Buffers are untyped pointers; `cc` is presumably the compute capability.
void na1d_pn_cuda_naive_half_cm_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads, int32_t length, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    const std::tuple<int32_t>& kernel_size,
    const std::tuple<int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 1-D `pn` naive CUDA path, variant `half_cm_1`
// (presumably element type half, causal-mask flag 1 -- TODO confirm).
// Buffers are untyped pointers; `cc` is presumably the compute capability.
void na1d_pn_cuda_naive_half_cm_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads, int32_t length, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    const std::tuple<int32_t>& kernel_size,
    const std::tuple<int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 1-D `pn` naive CUDA path, variant `bfloat16_cm_0`
// (presumably element type bfloat16, causal-mask flag 0 -- TODO confirm).
// Buffers are untyped pointers; `cc` is presumably the compute capability.
void na1d_pn_cuda_naive_bfloat16_cm_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads, int32_t length, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    const std::tuple<int32_t>& kernel_size,
    const std::tuple<int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 1-D `pn` naive CUDA path, variant `bfloat16_cm_1`
// (presumably element type bfloat16, causal-mask flag 1 -- TODO confirm).
// Buffers are untyped pointers; `cc` is presumably the compute capability.
void na1d_pn_cuda_naive_bfloat16_cm_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads, int32_t length, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    const std::tuple<int32_t>& kernel_size,
    const std::tuple<int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `double_cm_0_0`
// (presumably element type double, per-axis causal-mask flags 0,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_double_cm_0_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `double_cm_0_1`
// (presumably element type double, per-axis causal-mask flags 0,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_double_cm_0_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `double_cm_1_0`
// (presumably element type double, per-axis causal-mask flags 1,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_double_cm_1_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `double_cm_1_1`
// (presumably element type double, per-axis causal-mask flags 1,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_double_cm_1_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `float_cm_0_0`
// (presumably element type float, per-axis causal-mask flags 0,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_float_cm_0_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `float_cm_0_1`
// (presumably element type float, per-axis causal-mask flags 0,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_float_cm_0_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `float_cm_1_0`
// (presumably element type float, per-axis causal-mask flags 1,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_float_cm_1_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `float_cm_1_1`
// (presumably element type float, per-axis causal-mask flags 1,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_float_cm_1_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `half_cm_0_0`
// (presumably element type half, per-axis causal-mask flags 0,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_half_cm_0_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `half_cm_0_1`
// (presumably element type half, per-axis causal-mask flags 0,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_half_cm_0_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `half_cm_1_0`
// (presumably element type half, per-axis causal-mask flags 1,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_half_cm_1_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `half_cm_1_1`
// (presumably element type half, per-axis causal-mask flags 1,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_half_cm_1_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `bfloat16_cm_0_0`
// (presumably element type bfloat16, per-axis causal-mask flags 0,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_bfloat16_cm_0_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `bfloat16_cm_0_1`
// (presumably element type bfloat16, per-axis causal-mask flags 0,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_bfloat16_cm_0_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `bfloat16_cm_1_0`
// (presumably element type bfloat16, per-axis causal-mask flags 1,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_bfloat16_cm_1_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn` naive CUDA path, variant `bfloat16_cm_1_1`
// (presumably element type bfloat16, per-axis causal-mask flags 1,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na2d_pn_cuda_naive_bfloat16_cm_1_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `double_cm_0_0_0`
// (presumably element type double, per-axis causal-mask flags 0,0,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_double_cm_0_0_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `double_cm_0_0_1`
// (presumably element type double, per-axis causal-mask flags 0,0,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_double_cm_0_0_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `double_cm_0_1_0`
// (presumably element type double, per-axis causal-mask flags 0,1,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_double_cm_0_1_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `double_cm_0_1_1`
// (presumably element type double, per-axis causal-mask flags 0,1,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_double_cm_0_1_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `double_cm_1_0_0`
// (presumably element type double, per-axis causal-mask flags 1,0,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_double_cm_1_0_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `double_cm_1_0_1`
// (presumably element type double, per-axis causal-mask flags 1,0,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_double_cm_1_0_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `double_cm_1_1_0`
// (presumably element type double, per-axis causal-mask flags 1,1,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_double_cm_1_1_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `double_cm_1_1_1`
// (presumably element type double, per-axis causal-mask flags 1,1,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_double_cm_1_1_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `float_cm_0_0_0`
// (presumably element type float, per-axis causal-mask flags 0,0,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_float_cm_0_0_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `float_cm_0_0_1`
// (presumably element type float, per-axis causal-mask flags 0,0,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_float_cm_0_0_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `float_cm_0_1_0`
// (presumably element type float, per-axis causal-mask flags 0,1,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_float_cm_0_1_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `float_cm_0_1_1`
// (presumably element type float, per-axis causal-mask flags 0,1,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_float_cm_0_1_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `float_cm_1_0_0`
// (presumably element type float, per-axis causal-mask flags 1,0,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_float_cm_1_0_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `float_cm_1_0_1`
// (presumably element type float, per-axis causal-mask flags 1,0,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_float_cm_1_0_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `float_cm_1_1_0`
// (presumably element type float, per-axis causal-mask flags 1,1,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_float_cm_1_1_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `float_cm_1_1_1`
// (presumably element type float, per-axis causal-mask flags 1,1,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_float_cm_1_1_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `half_cm_0_0_0`
// (presumably element type half, per-axis causal-mask flags 0,0,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_half_cm_0_0_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `half_cm_0_0_1`
// (presumably element type half, per-axis causal-mask flags 0,0,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_half_cm_0_0_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `half_cm_0_1_0`
// (presumably element type half, per-axis causal-mask flags 0,1,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_half_cm_0_1_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `half_cm_0_1_1`
// (presumably element type half, per-axis causal-mask flags 0,1,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_half_cm_0_1_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `half_cm_1_0_0`
// (presumably element type half, per-axis causal-mask flags 1,0,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_half_cm_1_0_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `half_cm_1_0_1`
// (presumably element type half, per-axis causal-mask flags 1,0,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_half_cm_1_0_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `half_cm_1_1_0`
// (presumably element type half, per-axis causal-mask flags 1,1,0 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_half_cm_1_1_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant `half_cm_1_1_1`
// (presumably element type half, per-axis causal-mask flags 1,1,1 -- TODO
// confirm). Buffers are untyped pointers; `cc` is presumably the compute
// capability.
void na3d_pn_cuda_naive_half_cm_1_1_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant
// `bfloat16_cm_0_0_0` (presumably element type bfloat16, per-axis causal-mask
// flags 0,0,0 -- TODO confirm). Buffers are untyped pointers; `cc` is
// presumably the compute capability.
void na3d_pn_cuda_naive_bfloat16_cm_0_0_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant
// `bfloat16_cm_0_0_1` (presumably element type bfloat16, per-axis causal-mask
// flags 0,0,1 -- TODO confirm). Buffers are untyped pointers; `cc` is
// presumably the compute capability.
void na3d_pn_cuda_naive_bfloat16_cm_0_0_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant
// `bfloat16_cm_0_1_0` (presumably element type bfloat16, per-axis causal-mask
// flags 0,1,0 -- TODO confirm). Buffers are untyped pointers; `cc` is
// presumably the compute capability.
void na3d_pn_cuda_naive_bfloat16_cm_0_1_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant
// `bfloat16_cm_0_1_1` (presumably element type bfloat16, per-axis causal-mask
// flags 0,1,1 -- TODO confirm). Buffers are untyped pointers; `cc` is
// presumably the compute capability.
void na3d_pn_cuda_naive_bfloat16_cm_0_1_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant
// `bfloat16_cm_1_0_0` (presumably element type bfloat16, per-axis causal-mask
// flags 1,0,0 -- TODO confirm). Buffers are untyped pointers; `cc` is
// presumably the compute capability.
void na3d_pn_cuda_naive_bfloat16_cm_1_0_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant
// `bfloat16_cm_1_0_1` (presumably element type bfloat16, per-axis causal-mask
// flags 1,0,1 -- TODO confirm). Buffers are untyped pointers; `cc` is
// presumably the compute capability.
void na3d_pn_cuda_naive_bfloat16_cm_1_0_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant
// `bfloat16_cm_1_1_0` (presumably element type bfloat16, per-axis causal-mask
// flags 1,1,0 -- TODO confirm). Buffers are untyped pointers; `cc` is
// presumably the compute capability.
void na3d_pn_cuda_naive_bfloat16_cm_1_1_0(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn` naive CUDA path, variant
// `bfloat16_cm_1_1_1` (presumably element type bfloat16, per-axis causal-mask
// flags 1,1,1 -- TODO confirm). Buffers are untyped pointers; `cc` is
// presumably the compute capability.
void na3d_pn_cuda_naive_bfloat16_cm_1_1_1(
    int32_t cc, cudaStream_t stream, bool is_grad,
    void* query_ptr, void* key_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 1-D `pn_bias` naive CUDA path, variant `double_cm_0`
// (presumably element type double, causal-mask flag 0 -- TODO confirm).
// Takes an untyped `bias_ptr` buffer alongside query/key/attn; `cc` is
// presumably the compute capability.
void na1d_pn_bias_cuda_naive_double_cm_0(
    int32_t cc, cudaStream_t stream,
    void* query_ptr, void* key_ptr, void* bias_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads, int32_t length, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    const std::tuple<int32_t>& kernel_size,
    const std::tuple<int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 1-D `pn_bias` naive CUDA path, variant `float_cm_0`
// (presumably element type float, causal-mask flag 0 -- TODO confirm).
// Takes an untyped `bias_ptr` buffer alongside query/key/attn; `cc` is
// presumably the compute capability.
void na1d_pn_bias_cuda_naive_float_cm_0(
    int32_t cc, cudaStream_t stream,
    void* query_ptr, void* key_ptr, void* bias_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads, int32_t length, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    const std::tuple<int32_t>& kernel_size,
    const std::tuple<int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 1-D `pn_bias` naive CUDA path, variant `half_cm_0`
// (presumably element type half, causal-mask flag 0 -- TODO confirm).
// Takes an untyped `bias_ptr` buffer alongside query/key/attn; `cc` is
// presumably the compute capability.
void na1d_pn_bias_cuda_naive_half_cm_0(
    int32_t cc, cudaStream_t stream,
    void* query_ptr, void* key_ptr, void* bias_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads, int32_t length, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    const std::tuple<int32_t>& kernel_size,
    const std::tuple<int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 1-D `pn_bias` naive CUDA path, variant
// `bfloat16_cm_0` (presumably element type bfloat16, causal-mask flag 0 --
// TODO confirm). Takes an untyped `bias_ptr` buffer alongside query/key/attn;
// `cc` is presumably the compute capability.
void na1d_pn_bias_cuda_naive_bfloat16_cm_0(
    int32_t cc, cudaStream_t stream,
    void* query_ptr, void* key_ptr, void* bias_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads, int32_t length, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    const std::tuple<int32_t>& kernel_size,
    const std::tuple<int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn_bias` naive CUDA path, variant
// `double_cm_0_0` (presumably element type double, per-axis causal-mask flags
// 0,0 -- TODO confirm). Takes an untyped `bias_ptr` buffer alongside
// query/key/attn; `cc` is presumably the compute capability.
void na2d_pn_bias_cuda_naive_double_cm_0_0(
    int32_t cc, cudaStream_t stream,
    void* query_ptr, void* key_ptr, void* bias_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn_bias` naive CUDA path, variant
// `float_cm_0_0` (presumably element type float, per-axis causal-mask flags
// 0,0 -- TODO confirm). Takes an untyped `bias_ptr` buffer alongside
// query/key/attn; `cc` is presumably the compute capability.
void na2d_pn_bias_cuda_naive_float_cm_0_0(
    int32_t cc, cudaStream_t stream,
    void* query_ptr, void* key_ptr, void* bias_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn_bias` naive CUDA path, variant
// `half_cm_0_0` (presumably element type half, per-axis causal-mask flags
// 0,0 -- TODO confirm). Takes an untyped `bias_ptr` buffer alongside
// query/key/attn; `cc` is presumably the compute capability.
void na2d_pn_bias_cuda_naive_half_cm_0_0(
    int32_t cc, cudaStream_t stream,
    void* query_ptr, void* key_ptr, void* bias_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 2-D `pn_bias` naive CUDA path, variant
// `bfloat16_cm_0_0` (presumably element type bfloat16, per-axis causal-mask
// flags 0,0 -- TODO confirm). Takes an untyped `bias_ptr` buffer alongside
// query/key/attn; `cc` is presumably the compute capability.
void na2d_pn_bias_cuda_naive_bfloat16_cm_0_0(
    int32_t cc, cudaStream_t stream,
    void* query_ptr, void* key_ptr, void* bias_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1,
    int64_t attn_stride_2, int64_t attn_stride_3,
    const std::tuple<int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn_bias` naive CUDA path, variant
// `double_cm_0_0_0` (presumably element type double, per-axis causal-mask
// flags 0,0,0 -- TODO confirm). Takes an untyped `bias_ptr` buffer alongside
// query/key/attn; `cc` is presumably the compute capability.
void na3d_pn_bias_cuda_naive_double_cm_0_0_0(
    int32_t cc, cudaStream_t stream,
    void* query_ptr, void* key_ptr, void* bias_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn_bias` naive CUDA path, variant
// `float_cm_0_0_0` (presumably element type float, per-axis causal-mask
// flags 0,0,0 -- TODO confirm). Takes an untyped `bias_ptr` buffer alongside
// query/key/attn; `cc` is presumably the compute capability.
void na3d_pn_bias_cuda_naive_float_cm_0_0_0(
    int32_t cc, cudaStream_t stream,
    void* query_ptr, void* key_ptr, void* bias_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn_bias` naive CUDA path, variant
// `half_cm_0_0_0` (presumably element type half, per-axis causal-mask
// flags 0,0,0 -- TODO confirm). Takes an untyped `bias_ptr` buffer alongside
// query/key/attn; `cc` is presumably the compute capability.
void na3d_pn_bias_cuda_naive_half_cm_0_0_0(
    int32_t cc, cudaStream_t stream,
    void* query_ptr, void* key_ptr, void* bias_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 3-D `pn_bias` naive CUDA path, variant
// `bfloat16_cm_0_0_0` (presumably element type bfloat16, per-axis causal-mask
// flags 0,0,0 -- TODO confirm). Takes an untyped `bias_ptr` buffer alongside
// query/key/attn; `cc` is presumably the compute capability.
void na3d_pn_bias_cuda_naive_bfloat16_cm_0_0_0(
    int32_t cc, cudaStream_t stream,
    void* query_ptr, void* key_ptr, void* bias_ptr, void* attn_ptr,
    int32_t batch_size, int32_t heads,
    int32_t depth, int32_t height, int32_t width, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    int64_t attn_stride_3, int64_t attn_stride_4,
    const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
    const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Prototype only; no body accompanies this declaration.
// Host-side entry for the 1-D `nn` naive CUDA path, variant `double_cm_0`
// (presumably element type double, causal-mask flag 0 -- TODO confirm).
// attn/value/output are untyped pointers; `cc` is presumably the compute
// capability.
void na1d_nn_cuda_naive_double_cm_0(
    int32_t cc, cudaStream_t stream,
    void* attn_ptr, void* value_ptr, void* output_ptr,
    int32_t batch_size, int32_t heads, int32_t length, int32_t dim,
    int64_t attn_stride_0, int64_t attn_stride_1, int64_t attn_stride_2,
    const std::tuple<int32_t>& kernel_size,
    const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "nn" op, double elements, "cm_1" variant — confirm.
void na1d_nn_cuda_naive_double_cm_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "nn" op, float elements, "cm_0" variant — confirm.
void na1d_nn_cuda_naive_float_cm_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "nn" op, float elements, "cm_1" variant — confirm.
void na1d_nn_cuda_naive_float_cm_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "nn" op, half elements, "cm_0" variant — confirm.
void na1d_nn_cuda_naive_half_cm_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "nn" op, half elements, "cm_1" variant — confirm.
void na1d_nn_cuda_naive_half_cm_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "nn" op, bfloat16 elements, "cm_0" variant — confirm.
void na1d_nn_cuda_naive_bfloat16_cm_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "nn" op, bfloat16 elements, "cm_1" variant — confirm.
void na1d_nn_cuda_naive_bfloat16_cm_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, double elements, "cm_0_0" variant — confirm.
void na2d_nn_cuda_naive_double_cm_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, double elements, "cm_0_1" variant — confirm.
void na2d_nn_cuda_naive_double_cm_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, double elements, "cm_1_0" variant — confirm.
void na2d_nn_cuda_naive_double_cm_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, double elements, "cm_1_1" variant — confirm.
void na2d_nn_cuda_naive_double_cm_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, float elements, "cm_0_0" variant — confirm.
void na2d_nn_cuda_naive_float_cm_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, float elements, "cm_0_1" variant — confirm.
void na2d_nn_cuda_naive_float_cm_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, float elements, "cm_1_0" variant — confirm.
void na2d_nn_cuda_naive_float_cm_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, float elements, "cm_1_1" variant — confirm.
void na2d_nn_cuda_naive_float_cm_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, half elements, "cm_0_0" variant — confirm.
void na2d_nn_cuda_naive_half_cm_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, half elements, "cm_0_1" variant — confirm.
void na2d_nn_cuda_naive_half_cm_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, half elements, "cm_1_0" variant — confirm.
void na2d_nn_cuda_naive_half_cm_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, half elements, "cm_1_1" variant — confirm.
void na2d_nn_cuda_naive_half_cm_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, bfloat16 elements, "cm_0_0" variant — confirm.
void na2d_nn_cuda_naive_bfloat16_cm_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, bfloat16 elements, "cm_0_1" variant — confirm.
void na2d_nn_cuda_naive_bfloat16_cm_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, bfloat16 elements, "cm_1_0" variant — confirm.
void na2d_nn_cuda_naive_bfloat16_cm_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "nn" op, bfloat16 elements, "cm_1_1" variant — confirm.
void na2d_nn_cuda_naive_bfloat16_cm_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, double elements, "cm_0_0_0" variant — confirm.
void na3d_nn_cuda_naive_double_cm_0_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, double elements, "cm_0_0_1" variant — confirm.
void na3d_nn_cuda_naive_double_cm_0_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, double elements, "cm_0_1_0" variant — confirm.
void na3d_nn_cuda_naive_double_cm_0_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, double elements, "cm_0_1_1" variant — confirm.
void na3d_nn_cuda_naive_double_cm_0_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, double elements, "cm_1_0_0" variant — confirm.
void na3d_nn_cuda_naive_double_cm_1_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, double elements, "cm_1_0_1" variant — confirm.
void na3d_nn_cuda_naive_double_cm_1_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, double elements, "cm_1_1_0" variant — confirm.
void na3d_nn_cuda_naive_double_cm_1_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, double elements, "cm_1_1_1" variant — confirm.
void na3d_nn_cuda_naive_double_cm_1_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, float elements, "cm_0_0_0" variant — confirm.
void na3d_nn_cuda_naive_float_cm_0_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, float elements, "cm_0_0_1" variant — confirm.
void na3d_nn_cuda_naive_float_cm_0_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, float elements, "cm_0_1_0" variant — confirm.
void na3d_nn_cuda_naive_float_cm_0_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, float elements, "cm_0_1_1" variant — confirm.
void na3d_nn_cuda_naive_float_cm_0_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, float elements, "cm_1_0_0" variant — confirm.
void na3d_nn_cuda_naive_float_cm_1_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, float elements, "cm_1_0_1" variant — confirm.
void na3d_nn_cuda_naive_float_cm_1_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, float elements, "cm_1_1_0" variant — confirm.
void na3d_nn_cuda_naive_float_cm_1_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, float elements, "cm_1_1_1" variant — confirm.
void na3d_nn_cuda_naive_float_cm_1_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, half elements, "cm_0_0_0" variant — confirm.
void na3d_nn_cuda_naive_half_cm_0_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, half elements, "cm_0_0_1" variant — confirm.
void na3d_nn_cuda_naive_half_cm_0_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, half elements, "cm_0_1_0" variant — confirm.
void na3d_nn_cuda_naive_half_cm_0_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, half elements, "cm_0_1_1" variant — confirm.
void na3d_nn_cuda_naive_half_cm_0_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, half elements, "cm_1_0_0" variant — confirm.
void na3d_nn_cuda_naive_half_cm_1_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, half elements, "cm_1_0_1" variant — confirm.
void na3d_nn_cuda_naive_half_cm_1_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, half elements, "cm_1_1_0" variant — confirm.
void na3d_nn_cuda_naive_half_cm_1_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, half elements, "cm_1_1_1" variant — confirm.
void na3d_nn_cuda_naive_half_cm_1_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, bfloat16 elements, "cm_0_0_0" variant — confirm.
void na3d_nn_cuda_naive_bfloat16_cm_0_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, bfloat16 elements, "cm_0_0_1" variant — confirm.
void na3d_nn_cuda_naive_bfloat16_cm_0_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, bfloat16 elements, "cm_0_1_0" variant — confirm.
void na3d_nn_cuda_naive_bfloat16_cm_0_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, bfloat16 elements, "cm_0_1_1" variant — confirm.
void na3d_nn_cuda_naive_bfloat16_cm_0_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, bfloat16 elements, "cm_1_0_0" variant — confirm.
void na3d_nn_cuda_naive_bfloat16_cm_1_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, bfloat16 elements, "cm_1_0_1" variant — confirm.
void na3d_nn_cuda_naive_bfloat16_cm_1_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, bfloat16 elements, "cm_1_1_0" variant — confirm.
void na3d_nn_cuda_naive_bfloat16_cm_1_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/value/output pointers,
// depth/height/width extents, five attn strides and 3-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 3-D "nn" op, bfloat16 elements, "cm_1_1_1" variant — confirm.
void na3d_nn_cuda_naive_bfloat16_cm_1_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * value_ptr,
  void * output_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "in" op, double elements, "cm_0" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na1d_in_cuda_naive_double_cm_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "in" op, double elements, "cm_1" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na1d_in_cuda_naive_double_cm_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "in" op, float elements, "cm_0" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na1d_in_cuda_naive_float_cm_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "in" op, float elements, "cm_1" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na1d_in_cuda_naive_float_cm_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "in" op, half elements, "cm_0" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na1d_in_cuda_naive_half_cm_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "in" op, half elements, "cm_1" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na1d_in_cuda_naive_half_cm_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "in" op, bfloat16 elements, "cm_0" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na1d_in_cuda_naive_bfloat16_cm_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// a single length extent, three attn strides and 1-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 1-D "in" op, bfloat16 elements, "cm_1" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na1d_in_cuda_naive_bfloat16_cm_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "in" op, double elements, "cm_0_0" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na2d_in_cuda_naive_double_cm_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "in" op, double elements, "cm_0_1" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na2d_in_cuda_naive_double_cm_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "in" op, double elements, "cm_1_0" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na2d_in_cuda_naive_double_cm_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "in" op, double elements, "cm_1_1" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na2d_in_cuda_naive_double_cm_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "in" op, float elements, "cm_0_0" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na2d_in_cuda_naive_float_cm_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "in" op, float elements, "cm_0_1" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na2d_in_cuda_naive_float_cm_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "in" op, float elements, "cm_1_0" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na2d_in_cuda_naive_float_cm_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "in" op, float elements, "cm_1_1" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na2d_in_cuda_naive_float_cm_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "in" op, half elements, "cm_0_0" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na2d_in_cuda_naive_half_cm_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body here). Takes untyped attn/d_output/d_value pointers,
// height/width extents, four attn strides and 2-element kernel_size/dilation tuples.
// NOTE(review): name suggests the 2-D "in" op, half elements, "cm_0_1" variant;
// the d_ prefixes look like gradient buffers — confirm.
void na2d_in_cuda_naive_half_cm_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 2-D "in" entry point,
// `half` variant with flag suffix cm_1_0. Buffers are type-erased `void *`.
// NOTE(review): the two cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na2d_in_cuda_naive_half_cm_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 2-D "in" entry point,
// `half` variant with flag suffix cm_1_1. Buffers are type-erased `void *`.
// NOTE(review): the two cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na2d_in_cuda_naive_half_cm_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 2-D "in" entry point,
// `bfloat16` variant with flag suffix cm_0_0. Buffers are type-erased `void *`.
// NOTE(review): the two cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na2d_in_cuda_naive_bfloat16_cm_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 2-D "in" entry point,
// `bfloat16` variant with flag suffix cm_0_1. Buffers are type-erased `void *`.
// NOTE(review): the two cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na2d_in_cuda_naive_bfloat16_cm_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 2-D "in" entry point,
// `bfloat16` variant with flag suffix cm_1_0. Buffers are type-erased `void *`.
// NOTE(review): the two cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na2d_in_cuda_naive_bfloat16_cm_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 2-D "in" entry point,
// `bfloat16` variant with flag suffix cm_1_1. Buffers are type-erased `void *`.
// NOTE(review): the two cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na2d_in_cuda_naive_bfloat16_cm_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `double` variant with flag suffix cm_0_0_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_double_cm_0_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `double` variant with flag suffix cm_0_0_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_double_cm_0_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `double` variant with flag suffix cm_0_1_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_double_cm_0_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `double` variant with flag suffix cm_0_1_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_double_cm_0_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `double` variant with flag suffix cm_1_0_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_double_cm_1_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `double` variant with flag suffix cm_1_0_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_double_cm_1_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `double` variant with flag suffix cm_1_1_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_double_cm_1_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `double` variant with flag suffix cm_1_1_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_double_cm_1_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `float` variant with flag suffix cm_0_0_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_float_cm_0_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `float` variant with flag suffix cm_0_0_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_float_cm_0_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `float` variant with flag suffix cm_0_1_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_float_cm_0_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `float` variant with flag suffix cm_0_1_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_float_cm_0_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `float` variant with flag suffix cm_1_0_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_float_cm_1_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `float` variant with flag suffix cm_1_0_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_float_cm_1_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `float` variant with flag suffix cm_1_1_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_float_cm_1_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `float` variant with flag suffix cm_1_1_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_float_cm_1_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `half` variant with flag suffix cm_0_0_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_half_cm_0_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `half` variant with flag suffix cm_0_0_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_half_cm_0_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `half` variant with flag suffix cm_0_1_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_half_cm_0_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `half` variant with flag suffix cm_0_1_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_half_cm_0_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `half` variant with flag suffix cm_1_0_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_half_cm_1_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `half` variant with flag suffix cm_1_0_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_half_cm_1_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `half` variant with flag suffix cm_1_1_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_half_cm_1_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `half` variant with flag suffix cm_1_1_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_half_cm_1_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `bfloat16` variant with flag suffix cm_0_0_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_bfloat16_cm_0_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `bfloat16` variant with flag suffix cm_0_0_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_bfloat16_cm_0_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `bfloat16` variant with flag suffix cm_0_1_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_bfloat16_cm_0_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `bfloat16` variant with flag suffix cm_0_1_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_bfloat16_cm_0_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `bfloat16` variant with flag suffix cm_1_0_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_bfloat16_cm_1_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `bfloat16` variant with flag suffix cm_1_0_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_bfloat16_cm_1_0_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `bfloat16` variant with flag suffix cm_1_1_0. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_bfloat16_cm_1_1_0(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "in" entry point,
// `bfloat16` variant with flag suffix cm_1_1_1. Buffers are type-erased `void *`.
// NOTE(review): the three cm digits presumably toggle causal masking per axis and
// `cc` is presumably the compute capability — confirm against the generator.
void na3d_in_cuda_naive_bfloat16_cm_1_1_1(
  int32_t cc,
  cudaStream_t stream,
  void * attn_ptr,
  void * d_output_ptr,
  void * d_value_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 1-D "rpbgrad" entry
// point, `double` variant with flag suffix cm_0. Buffers are type-erased `void *`.
// NOTE(review): "rpbgrad" presumably means the gradient w.r.t. a relative
// positional bias, written to d_bias_ptr from d_attn_ptr — confirm.
void na1d_rpbgrad_cuda_naive_double_cm_0(
  int32_t cc,
  cudaStream_t stream,
  void * d_bias_ptr,
  void * d_attn_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 1-D "rpbgrad" entry
// point, `float` variant with flag suffix cm_0. Buffers are type-erased `void *`.
// NOTE(review): "rpbgrad" presumably means the gradient w.r.t. a relative
// positional bias, written to d_bias_ptr from d_attn_ptr — confirm.
void na1d_rpbgrad_cuda_naive_float_cm_0(
  int32_t cc,
  cudaStream_t stream,
  void * d_bias_ptr,
  void * d_attn_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 1-D "rpbgrad" entry
// point, `half` variant with flag suffix cm_0. Buffers are type-erased `void *`.
// NOTE(review): "rpbgrad" presumably means the gradient w.r.t. a relative
// positional bias, written to d_bias_ptr from d_attn_ptr — confirm.
void na1d_rpbgrad_cuda_naive_half_cm_0(
  int32_t cc,
  cudaStream_t stream,
  void * d_bias_ptr,
  void * d_attn_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 1-D "rpbgrad" entry
// point, `bfloat16` variant with flag suffix cm_0. Buffers are type-erased `void *`.
// NOTE(review): "rpbgrad" presumably means the gradient w.r.t. a relative
// positional bias, written to d_bias_ptr from d_attn_ptr — confirm.
void na1d_rpbgrad_cuda_naive_bfloat16_cm_0(
  int32_t cc,
  cudaStream_t stream,
  void * d_bias_ptr,
  void * d_attn_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t length,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  const std::tuple<int32_t>& kernel_size,
  const std::tuple<int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 2-D "rpbgrad" entry
// point, `double` variant with flag suffix cm_0_0. Buffers are type-erased `void *`.
// NOTE(review): "rpbgrad" presumably means the gradient w.r.t. a relative
// positional bias, written to d_bias_ptr from d_attn_ptr — confirm.
void na2d_rpbgrad_cuda_naive_double_cm_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * d_bias_ptr,
  void * d_attn_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 2-D "rpbgrad" entry
// point, `float` variant with flag suffix cm_0_0. Buffers are type-erased `void *`.
// NOTE(review): "rpbgrad" presumably means the gradient w.r.t. a relative
// positional bias, written to d_bias_ptr from d_attn_ptr — confirm.
void na2d_rpbgrad_cuda_naive_float_cm_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * d_bias_ptr,
  void * d_attn_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 2-D "rpbgrad" entry
// point, `half` variant with flag suffix cm_0_0. Buffers are type-erased `void *`.
// NOTE(review): "rpbgrad" presumably means the gradient w.r.t. a relative
// positional bias, written to d_bias_ptr from d_attn_ptr — confirm.
void na2d_rpbgrad_cuda_naive_half_cm_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * d_bias_ptr,
  void * d_attn_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 2-D "rpbgrad" entry
// point, `bfloat16` variant with flag suffix cm_0_0. Buffers are type-erased `void *`.
// NOTE(review): "rpbgrad" presumably means the gradient w.r.t. a relative
// positional bias, written to d_bias_ptr from d_attn_ptr — confirm.
void na2d_rpbgrad_cuda_naive_bfloat16_cm_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * d_bias_ptr,
  void * d_attn_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  const std::tuple<int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "rpbgrad" entry
// point, `double` variant with flag suffix cm_0_0_0. Buffers are type-erased `void *`.
// NOTE(review): "rpbgrad" presumably means the gradient w.r.t. a relative
// positional bias, written to d_bias_ptr from d_attn_ptr — confirm.
void na3d_rpbgrad_cuda_naive_double_cm_0_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * d_bias_ptr,
  void * d_attn_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "rpbgrad" entry
// point, `float` variant with flag suffix cm_0_0_0. Buffers are type-erased `void *`.
// NOTE(review): "rpbgrad" presumably means the gradient w.r.t. a relative
// positional bias, written to d_bias_ptr from d_attn_ptr — confirm.
void na3d_rpbgrad_cuda_naive_float_cm_0_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * d_bias_ptr,
  void * d_attn_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "rpbgrad" entry
// point, `half` variant with flag suffix cm_0_0_0. Buffers are type-erased `void *`.
// NOTE(review): "rpbgrad" presumably means the gradient w.r.t. a relative
// positional bias, written to d_bias_ptr from d_attn_ptr — confirm.
void na3d_rpbgrad_cuda_naive_half_cm_0_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * d_bias_ptr,
  void * d_attn_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);

// Declaration only (no body in this header): naive CUDA 3-D "rpbgrad" entry
// point, `bfloat16` variant with flag suffix cm_0_0_0. Buffers are type-erased `void *`.
// NOTE(review): "rpbgrad" presumably means the gradient w.r.t. a relative
// positional bias, written to d_bias_ptr from d_attn_ptr — confirm.
void na3d_rpbgrad_cuda_naive_bfloat16_cm_0_0_0(
  int32_t cc,
  cudaStream_t stream,
  void * d_bias_ptr,
  void * d_attn_ptr,
  int32_t batch_size,
  int32_t heads,
  int32_t depth,
  int32_t height,
  int32_t width,
  int32_t dim,
  int64_t attn_stride_0,
  int64_t attn_stride_1,
  int64_t attn_stride_2,
  int64_t attn_stride_3,
  int64_t attn_stride_4,
  const std::tuple<int32_t, int32_t, int32_t>& kernel_size,
  const std::tuple<int32_t, int32_t, int32_t>& dilation);



} // namespace naive
} // namespace cuda
} // namespace natten

